require(mice)
require(lattice)
library(dplyr)
library(rMIDAS)
# set_python_env(python ="/opt/anaconda3/bin/python")
set_python_env(x ="C:\\ProgramData\\Anaconda3\\",type = "conda")
## [1] TRUE
library(ggplot2)
library(gridExtra)
library("GGally")
data <- fdgs %>% select(-c(id,wgt.z,hgt.z)) %>% na.omit()
head(data)
## reg age sex hgt wgt
## 1 West 13.09514 boy 175.5 75.0
## 2 West 13.81793 boy 148.4 40.0
## 3 West 13.97125 boy 159.9 42.0
## 4 West 13.98220 girl 159.7 46.5
## 5 West 13.52225 girl 160.3 47.8
## 6 East 10.21492 boy 157.8 39.7
ggpairs(data[,c("age","hgt","wgt","sex")], aes(colour = sex, alpha = 0.4))
ggpairs(data[,c("age","hgt","wgt","reg")], aes(colour = reg, alpha = 0.4))
data <- data %>% mutate(sq_age=sqrt(age)) %>% mutate(log_wgt=log2(wgt))
ggpairs(data[,c("sq_age","hgt","log_wgt","sex")], aes(colour = sex, alpha = 0.4))
data <- fdgs %>% select(-c(id,wgt.z,hgt.z)) %>% na.omit()
# data$wgt <- log2(data$wgt)
# data$age <- sqrt(data$age)
head(data)
## reg age sex hgt wgt
## 1 West 13.09514 boy 175.5 75.0
## 2 West 13.81793 boy 148.4 40.0
## 3 West 13.97125 boy 159.9 42.0
## 4 West 13.98220 girl 159.7 46.5
## 5 West 13.52225 girl 160.3 47.8
## 6 East 10.21492 boy 157.8 39.7
miss_data <- add_missingness(data, prop = 0.1)
miss_data <- as.data.frame(miss_data)
miss_index <- which(is.na(miss_data[,"reg"]))
# view miss number of miss data by coluemns
print(sapply(miss_data, function(x) sum(is.na(x))))
## reg age sex hgt wgt
## 990 998 956 973 1046
# md.pattern(miss_data)
The number of trees has a significant impact on the performance of the model.
library(missRanger)
# impt_ranger_data <- missRanger(miss_data, num.trees = 100, verbose = 0)
# impt_ranger_data$reg <- as.integer(impt_ranger_data$reg) # ranger produce float number of categorical
# impt_ranger_data$sex <- as.integer(impt_ranger_data$sex)
impt_ranger_data <- replicate(
10,
as.data.frame(missRanger(miss_data, verbose = 0, num.trees = 100)),
simplify = FALSE
)
for (i in 1:10){
impt_ranger_data[[i]][,"sex"] <- as.integer(impt_ranger_data[[i]][,"sex"])
impt_ranger_data[[i]][,"reg"] <- as.integer(impt_ranger_data[[i]][,"reg"])
}
For imputation purposes, it is expedient to select a suitable subset of data that contains no more than 15 to 25 variables. -doc page 22
imp <- mice(miss_data, print=F)
meth <- imp$meth
meth[c('sex','reg')] <- "rf"
meth[c('age','hgt','wgt')] <- 'rf'
imp <- mice(miss_data, m=10, method = meth, print=F)
imp20 <- mice.mids(imp, maxit=15, print=F)
impt_mice_data <- list()
for (i in 1:10){
impt_mice <- mice::complete(imp20,action=i)
impt_mice_data <- append(impt_mice_data,list(impt_mice))
}
col_bin <- c('sex')
col_cat <- c('reg')
# Apply rMIDAS preprocessing steps
data_conv <- rMIDAS::convert(miss_data,
bin_cols = col_bin,
cat_cols = col_cat,
minmax_scale = TRUE)
# Train the model for 20 epochs
rmidas_train <- rMIDAS::train(data_conv,
training_epochs = 50,
layer_structure = c(128,256,128),
input_drop = 0.75,
seed = 89)
## Initialising Python connection
# Generate 10 imputed datasets
impt_rmidas_data <- rMIDAS::complete(rmidas_train, m = 10,fast = TRUE)
## Imputations generated. Completing post-imputation transformations.
create_compare_data <- function(df,miss_df,impt_df_list,col,m=10,method="mice"){
# we only need to compare the missing values
miss_df <- as.data.frame(miss_df)
miss_index <- which(is.na(miss_df[,col]))
df <- df[miss_index,]
df["source"] <- rep("True",length(miss_index))
for(i in 1:m){
df2 <- impt_df_list[[i]]
df2 <- df2[miss_index,]
df2["source"] <- rep(method,length(miss_index))
# df2 <- df2[df2$age>=0,]
df <- rbind(df2,df)
}
df
}
df_mice_wgt <- create_compare_data(data,miss_data,impt_mice_data,col = "wgt",method = "mice")
ggplot(df_mice_wgt, aes(age,wgt, colour = source))+geom_point(alpha=0.4)+stat_smooth()
df_ranger_wgt <- create_compare_data(data,miss_data,impt_ranger_data,col = "wgt",method = "ranger")
ggplot(df_ranger_wgt, aes(age,wgt, colour = source))+geom_point(alpha=0.4)+stat_smooth()
df_mida_wgt <- create_compare_data(data,miss_data,impt_rmidas_data,col = "wgt",method = "midas")
ggplot(df_mida_wgt, aes(age,wgt, colour = source))+geom_point(alpha=0.4)+stat_smooth()
df_mice_hgt <- create_compare_data(data,miss_data,impt_mice_data,col = "hgt",method = "mice")
ggplot(df_mice_hgt, aes(age,hgt, colour = source))+geom_point(alpha=0.4)+stat_smooth()
df_ranger_hgt <- create_compare_data(data,miss_data,impt_ranger_data,col = "hgt",method = "ranger")
ggplot(df_ranger_hgt, aes(age,hgt, colour = source))+geom_point(alpha=0.4)+stat_smooth()
As we can see from the plot, MIDAS imputed negative values for ages, which does not make sense.
df_mida_hgt <- create_compare_data(data,miss_data,impt_rmidas_data,col = "hgt",method = "midas")
ggplot(df_mida_hgt, aes(age,hgt, colour = source))+geom_point(alpha=0.4)+stat_smooth()
As we can see from the plot, MIDAS imputed negative values for ages, which does not make sense.
df <- impt_rmidas_data[[1]]
ggpairs(df[,c("age","hgt","wgt","sex")], aes(colour = sex, alpha = 0.4))
create_compare_data_nasty <- function(col,m=10){
miss_data <- as.data.frame(miss_data)
miss_index <- which(is.na(miss_data[,col]))
data["Methods"] <- rep("True")
data <- data[miss_index,]
for (i in 1:m){
df_midas <- impt_rmidas_data[[i]]
df_midas["Methods"] <- "midas"
data <- rbind(data,df_midas[miss_index,])
df_mice <- impt_mice_data[[i]]
df_mice["Methods"] <- "mice"
data <- rbind(data,df_mice[miss_index,])
df_ranger <- impt_ranger_data[[i]]
df_ranger["Methods"] <- "ranger"
data <- rbind(data,df_ranger[miss_index,])
}
data
}
data_nas <- create_compare_data_nasty(col="hgt")
p <- ggplot(data_nas, aes(factor(Methods), hgt))
p+geom_violin(aes(fill = Methods))+geom_boxplot(width=0.1)+
stat_summary(fun=mean, geom="point", size=2, color="red")
ggplot(data_nas, aes(age,hgt, colour = Methods))+geom_point(alpha=0.4)+stat_smooth()
data_nas <- create_compare_data_nasty(col="wgt")
p <- ggplot(data_nas, aes(factor(Methods), wgt))
p+geom_violin(aes(fill = Methods))+geom_boxplot(width=0.1)+
stat_summary(fun=mean, geom="point", size=2, color="red")
ggplot(data_nas, aes(age,wgt, colour = Methods))+geom_point(alpha=0.4)+stat_smooth()
data_nas <- create_compare_data_nasty(col="age")
p <- ggplot(data_nas, aes(factor(Methods), age))
p+geom_violin(aes(fill = Methods))+geom_boxplot(width=0.1)+
stat_summary(fun=mean, geom="point", size=2, color="red")
library(scales)
library(caret)
library(gdata)
ggplotConfusionMatrix <- function(m, col_names){
#https://stackoverflow.com/questions/51410405/ggplot2-confusion-matrix-geom-text-labeling
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
data_c <- mutate(group_by(as.data.frame(m$table), Reference ), percentage =
percent(Freq/sum(Freq)))
p <-
ggplot(data = data_c,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = Freq), colour = "white") +
scale_fill_gradient(low = "white", high = "green") +
geom_text(aes(x = Reference, y = Prediction, label = percentage)) +
scale_x_discrete(labels=col_names)+
scale_y_discrete(labels=col_names)+
# theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
plot_confusion_matrix <- function(impt_data_list, data, miss_df, col, m=10){
miss_df <- as.data.frame(miss_df)
miss_index <- which(is.na(miss_df[,col]))
pred_values <- c()
for (i in 1:m){
pred <- impt_data_list[[i]]
pred <- pred[,col]
pred_values <- c(pred_values,pred[miss_index])
}
true_labels <- data[miss_index,col]
pred_values <- as.factor(pred_values)
true_labels <- as.factor(as.numeric(as.factor(rep(true_labels,m))))
pred_values <- factor(pred_values,levels = levels(true_labels))
true_labels <- factor(true_labels,levels = levels(true_labels))
cfm <- confusionMatrix(true_labels,pred_values)
map <- mapLevels(x=as.factor(data[,col]))
ggplotConfusionMatrix(cfm,names(map))
}
plot_confusion_matrix(impt_mice_data,data,miss_data,col = "sex")
plot_confusion_matrix(impt_ranger_data,data,miss_data,col = "sex")
plot_confusion_matrix(impt_rmidas_data,data,miss_data,col = "sex")
plot_confusion_matrix(impt_mice_data,data,miss_data,col = "reg")
plot_confusion_matrix(impt_ranger_data,data,miss_data,col = "reg")
## Warning: Removed 15 rows containing missing values (geom_text).
plot_confusion_matrix(impt_rmidas_data,data,miss_data,col = "reg")